'''


exec(''.join(open("/home/dalton_m/ppp/pyfiles/ANppp_ldbv1_2019.py", encoding="utf8").readlines()[:]))
nohup python3 /home/dalton_m/ppp/pyfiles/ANppp_ldbv1_2019.py  | tee &
'''


import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
import sys

sys.path.append('/home/dalton_m/payload')
from basicfunctions import *

filename = 'ANppp_ldbv1_2019'

resultsloc1 = "/dataERS/eract/daltonm/results/ppp/"

from datetime import date
datestr = date.today().strftime(format="%Y%m%d")
resultsloc = resultsloc1 + datestr + '/'
if not os.path.exists(resultsloc):
    os.makedirs(resultsloc)


logging.basicConfig(filename=resultsloc + filename + '.txt', level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.debug('This message should go to the log file')
logging.info('So should this')
logging.warning('And this, too')
logging.exception('And this, too')
logging.captureWarnings(True)

sys.path.append(resultsloc1 + 'pyfiles/')

sampledict = {
    # file name, sample size, seed number
    1: ['big', 3e6, 1],
    2: ['medium', 1e6, 2],
    3: ['small', 1e5, 3],
}

tlist = sampledict[3]
fsuff = tlist[0]
sampsize = tlist[1]
sampseed = tlist[2]

################################
#### get 2020-2021 LDB numbers to limit 2018 sample to only those still around in 2020
q = 5
moncount = q*3 + 1
tfilename = 'ANnberv3'
df = cudf.read_csv(dataloc + 'pppfiles/' + 'ANppp_ldbv1' + '_step1.csv')
#only keep if it existed at some point in 2019 and 2020
kcond = (df['emp_jan_feb20']>0) & (df['aaemp_19'] > 0)
df = df[kcond]

ldbnums = df['ldb_num'].unique().to_pandas().tolist()
df = None
####################################################
#######get 2018 data
yr = 17
fname2 = 'ANnberv2'
dfta = opennew('ldb/'+ fname2 +'setupdate_0020', '')
dflist = []
for fips in statefips['fips'].unique():
    yr1 = 2000 + yr + 1
    qtr = ''
    own = 5
    cols = ['emp_m' + str(i) for i in range(1, 4)] + ['emp_flag_m' + str(i) for i in range(1, 4)] + ['ein', 'aaemp',
                                                                                                     'wages',
                                                                                                     'naics_code',
                                                                                                     'meei']
    renamedict = {
        'fips': 'fipsstate',
        'naics_code': 'naics',
    }
    df = qcew(yr1, qtr, fips, own, cols).rename(columns=renamedict)
    df = df[df['meei'].isin([1, 3, 4, 5, 6])].drop(columns='meei')
    if q > 4:
        renamedict2 = {'emp_m' + str(i): 'emp_m' + str(i + 12) for i in range(1, 4)}
        renamedict2.update({'emp_m' + str(i) + 'flag': 'emp_m' + str(i + 12) + 'flag' for i in range(1, 4)})
        cols = ['emp_m' + str(i) for i in range(1, 4)] + ['emp_flag_m' + str(i) for i in range(1, 4)] + [
            'ein', 'wages', 'naics_code', 'meei'] + ['wages_flag_q1']
        for q1 in range(5, q + 1):
            dftp1 = qcew(yr1 + 1, qtr, fips, own, cols).rename(columns=renamedict).rename(columns=renamedict2).rename(
                columns={'wages_1': 'wages_' + str(q1)}).rename(
                columns={'wages_flag1': 'wages_flag' + str(q1)})
            kcols2 = list(
                set(list(renamedict.values()) + list(renamedict2.values()) + ['wages_' + str(q1), 'wages_flag' + str(q1), 'ldb_num', ]))
            dftp1 = dftp1[dftp1['meei'].isin([1, 3, 4, 5, 6])][kcols2]
            df = df.merge(dftp1, on=['ldb_num'], how='left')
    tcols = [c[:-2] for c in df if c.endswith('_y')]
    for c in tcols:
        df[c] = df[c + '_x'].fillna(df[c + '_y'])
    df = df.drop(columns=[c + '_x' for c in tcols] + [c + '_y' for c in tcols])
    ###prior year LDB
    yrm1 = 2000 + yr
    # 'ui_acct', 'rpt_unit',
    cols = ['ein', 'aaemp', 'meei', 'wages']
    dfldb = qcew(yrm1, qtr, fips, own, cols).rename(columns={'ui_acct': 'ui_acct',
                                                             'rpt_unit': 'run_num',
                                                             'naics_code': 'naics_ldb'})
    dfldb = dfldb[dfldb['meei'].isin([1, 3, 4, 5, 6])].drop(columns='meei')
    dfldb = dfldb[~(((dfldb['emp_m7'].isnull()) | (dfldb['emp_m7'] == 0)) & (
            (dfldb['emp_m8'].isnull()) | (dfldb['emp_m8'] == 0)) & (
                            (dfldb['emp_m9'].isnull()) | (dfldb['emp_m9'] == 0)) & (
                            (dfldb['emp_m10'].isnull()) | (dfldb['emp_m10'] == 0)) & (
                            (dfldb['emp_m11'].isnull()) | (dfldb['emp_m11'] == 0)) & (
                            (dfldb['emp_m12'].isnull()) | (dfldb['emp_m12'] == 0)))]
    dfldb['aaemp'] = dfldb[['emp_m' + str(i) for i in range(1, 13)]].to_pandas().sum(axis=1)
    dfldb['emp_max'] = dfldb[['emp_m' + str(i) for i in range(1, 13)]].to_pandas().max(axis=1)
    dfldb['tot_wages'] = dfldb[['wages_' + str(i) for i in range(1, 5)]].to_pandas().sum(axis=1)
    ### to get number of months with positive employment
    dfldb['emp_count'] = 0
    # multiply qtr x 4 to make it annual
    for mon in range(1, 13):
        dfldb['emp_count'] = dfldb['emp_count'].to_pandas() + 1 * ((dfldb['emp_m' + str(mon)] > 0)).to_array()

    dfldb['aaemp'] = dfldb['aaemp'] / dfldb['emp_count']
    dfldb['monthly_wage_avg'] = dfldb['tot_wages'] / dfldb['emp_count']

    # closures
    mondict = {
        1: 'jan',
        2: 'feb',
        3: 'mar',
        4: 'apr',
        5: 'may',
        6: 'jun',
        7: 'jul',
        8: 'aug',
        9: 'sep',
        10: 'oct',
        11: 'nov',
        12: 'dec',
    }
    for i, mon in mondict.items():
        dfldb[mon + '_closed'] = 1 * (dfldb['emp_m' + str(i)].to_pandas().fillna(0) == 0)

    closedmons = [mon + '_closed' for i, mon in mondict.items()]

    # relative to 3 months worth of tot_wage_monthly
    # because this is quarterly ratio
    dfldb['janmar_wages_ratio'] = ((dfldb['wages_1'].to_pandas().fillna(0))) / (
                3 * dfldb['monthly_wage_avg'].to_pandas())
    dfldb['aprjun_wages_ratio'] = ((dfldb['wages_2'].to_pandas().fillna(0))) / (
                3 * dfldb['monthly_wage_avg'].to_pandas())
    dfldb['julsep_wages_ratio'] = ((dfldb['wages_3'].to_pandas().fillna(0))) / (
                3 * dfldb['monthly_wage_avg'].to_pandas())
    dfldb['octdec_wages_ratio'] = ((dfldb['wages_4'].to_pandas().fillna(0))) / (
                3 * dfldb['monthly_wage_avg'].to_pandas())
    dfldb['emp_feb_june19'] = dfldb[['emp_m' + str(i) for i in range(2, 7)]].to_pandas().mean(axis=1)
    for c in [  # 'janmar_emp_ratio', 'aprjun_emp_ratio', 'julsep_emp_ratio', 'octdec_emp_ratio',
        'janmar_wages_ratio', 'aprjun_wages_ratio', 'julsep_wages_ratio', 'octdec_wages_ratio',
    ]:
        dfldb[c] = dfldb[c].to_pandas().replace(np.inf, np.NaN).fillna(1)
        cond = (dfldb[c] > 1).to_array()
        dfldb[c] = np.where(cond, 1, 0)
    kcols = ['ldb_num', 'emp_feb_june19',
             # 'janmar_emp_ratio', 'aprjun_emp_ratio', 'julsep_emp_ratio', 'octdec_emp_ratio',
             #'janmar_wages_ratio', 'aprjun_wages_ratio', 'julsep_wages_ratio', 'octdec_wages_ratio',
             'aaemp', 'monthly_wage_avg','ein','emp_max'
             ] + closedmons
    renamedict = {
        'aaemp': 'aaemp_17',
        'monthly_wage_avg': 'monthly_wage_avg_17',
        'emp_max' : 'emp_max_17'
    }
    dfldb = dfldb[kcols].rename(columns=renamedict)

    mc = ['ldb_num']
    # dflist.append(sqlmerge(dft, dfldb, mc))
    df = df.merge(dfldb, on=mc, how='left', indicator=False).to_pandas()
    if 'ein_x' in df:
        df['ein'] = df['ein_x'].fillna(df['ein_y'])
        df = df.drop(columns=['ein_x', 'ein_y'])
    ###merge in EIN
    mc = ['ein']
    aggstats = {
        'aaemp_' + str(yr): 'sum',
        'ldb_num': 'count'
    }
    dflist.append(df.merge(
        dfldb[['ein', 'aaemp_' + str(yr), 'ldb_num']].to_pandas().groupby('ein', as_index=False).agg(aggstats).rename(
            columns={'aaemp_' + str(yr): 'ein_aaemp_' + str(yr), 'ldb_num': 'num_estab_ein'}), on=mc, how='left',
        indicator=False))

dft = None
dfldb = None
df = pd.concat(dflist, ignore_index=True)
dflist = None

# df = sqlmerge(df, dft, ['ldb_num'])
##merge setupdate
df = df.merge(dfta.to_pandas(), how='left', on=['ldb_num'])
# df = sqlmerge(df, dfta, ['ldb_num'])
# dft = None
dfta = None
#keep only if 2020
df = df.loc[df['ldb_num'].isin(ldbnums)]

kwvar = 'monthly_wage_avg_17'


'''
##############
ein / employment edits
---not sure how mcuh of this i need
##############
'''


#filling in missing ein
df['ein_aaemp_17_fill2'] = df['ein_aaemp_17'].fillna(df['aaemp_17'])
df['aaemp_17_fill2'] = df['aaemp_17'].copy()
df['aaemp_17_fill'] = df['aaemp_17'].copy()


####wage per worker
##8/29- this is different than main file - not sure how much of a difference it makes
df['wage_per_worker'] = 12*df['monthly_wage_avg_17'] / (df['aaemp_17_fill2'])
df['wage_per_worker'] = df['wage_per_worker'].replace([np.inf], np.NaN)

wlabs = ['<20k','>20k,<40k','>40k,<60k','>60k,<80k','>80k',]
wlabs = {i:j for i,j in enumerate(wlabs)}
bins = [-1, 20e3, 40e3, 60e3, 80e3, 9e9]
df['wage_class'] = pd.cut(df['wage_per_worker'], bins=bins, labels=wlabs.keys(), right=False).astype('float')


'''
######################################################
ppp data
##################################################################################################
'''


####size class
slabs = categoricalvariables('size')
bins = [-1, 10, 50, 250, 500, 9e9]
df['size_class'] = pd.cut(df['ein_aaemp_17_fill2'], bins=bins, labels=slabs.keys(), right=False).astype('float')

s2labs = ['0', '500']
bins = [-1, 500, 9e9]
df['size_class_2'] = pd.cut(df['ein_aaemp_17_fill2'], bins=bins, labels=s2labs, right=True).astype('float')


cond = ((df['ein_aaemp_17_fill2'] > df['aaemp_17_fill2']))
df['Dmulti'] = np.where(cond, 1, 0)

df['size_class_multi'] = ''
for t in s2labs[1:]:
    t=float(t)
    cond = ((df['size_class_2'] == t) & (df['Dmulti'] == 1))
    cond1 = ((df['size_class_2'] == t) & (df['Dmulti'] == 0))
    df['size_class_multi'] = np.where(cond, 'm' + str(int(t)), np.where(cond1, 's' + str(int(t)), df['size_class_multi']) )

t = 0
cond = (df['size_class_2'] == t)
df['size_class_multi'] = np.where(cond, '<500',df['size_class_multi'])


naicsdict = naics2c()
tdict = {v: k for k, v in categoricalvariables('naics').items()}


df['naics2'] = df['naics'].astype('str').str[:2].map(naicsdict).map(tdict)


####open up PPP

mc = ['ldb_num']
###ldb specific info
dfp = opennew('pppfiles/pppagg2020', ['LoanAmount', 'DateApproved',  'ldb_num']).drop_duplicates(mc).to_pandas()
df = df.merge(dfp, on = mc, how = 'left')

mc = ['ein']
dfp = opennew('pppfiles/pppagg2020', ['LoanAmount_ein', 'DateApproved_ein',  'ein']).drop_duplicates(mc).to_pandas()
df = df.merge(dfp, on = mc, how = 'left')
dfp = None


####open up EIDL

mc = ['ldb_num']
###ldb specific info
dfp = opennew('pppfiles/combinedeidl', ['D_eidl_grant', 'D_eidl_loan', 'ldb_num']).drop_duplicates(mc).to_pandas()
df = df.merge(dfp, on = mc, how = 'left')
dfp = None
###ldb specific info - ppp 2021
rdict = {'LoanAmount' : 'LoanAmount_2021',
         }
dfp = opennew('pppfiles/pppagg2021', ['LoanAmount',  'ldb_num']).drop_duplicates(mc).rename(columns = rdict).to_pandas()
df = df.merge(dfp, on = mc, how = 'left')
dfp = None
#### franchise

einlist = cudf.read_csv(dataloc + 'pppfiles/sba_franchise_fullymerged.csv', sep='|')[
    'ein'].unique().to_pandas().tolist()
tdict = {i : 1 for i in einlist}

df['Dfranchise'] = df['ein'].map(tdict).fillna(0)


#industry edit
df['naics2'] = df['naics'].astype('str').str[:2]
cond1 = (df['naics2'].isin(['31','32','33']))
cond2 = (df['naics2'].isin(['44', '45']))
cond3 = (df['naics2'].isin(['48', '49']))
df['naics2'] = np.where(cond1, '31', np.where(cond2, '44', np.where(cond3, '48', df['naics2'])))

####other LDB info
yr = 17
kcols = ['cutoff_score','owner_code',
         'hhi',  ] + [ 'urban_classification', 'yr_growth',]

mc = ['ldb_num']
dfldb = ldbdata(2000 + yr, kcols, '').to_pandas()
df = df.merge(dfldb, on = mc, how = 'left')

dfldb = None

cond = (df['cutoff_score'] <= 1)
df['Deligible'] = np.where(cond, 1, 0)

cond = (df['LoanAmount'] > 0)
df['Dppp'] = np.where(cond, 1, 0)
# dummy for ppp receipt at EIN level
cond = ((df['LoanAmount'] > 0) | (df['LoanAmount_ein'] > 0))
df['Dppp_ein'] = np.where(cond, 1, 0)
#dummy for ppp 2021 receipt
cond = (df['LoanAmount_2021'] > 0)
df['Dppp2021'] = np.where(cond,1,0)



####if EIN has less than $500 per employee, then treat it as missing
df['amt_peremp_ein'] = df['LoanAmount_ein'] / df['ein_aaemp_17']
cond = (df['amt_peremp_ein'] < 500)
df['LoanAmount_ein'] = np.where(cond, np.NaN, df['LoanAmount_ein'])
df['DateApproved_ein'] = np.where(cond, np.NaN, df['DateApproved_ein'])

###get amount and date approved
df['approval_date'] = pd.to_datetime(df['DateApproved_ein'].fillna(df['DateApproved']), errors='coerce')
df['amount_final'] = df['LoanAmount_ein'].fillna(df['LoanAmount']).fillna(0)

cond = (df['amount_final'] > 0)
df['Dppp_final'] = np.where(cond, 1, 0)


'''
*******************************
here's the plan:
i want to REMOVE eligible establishments that did NOT get a loan. This was a choice and therefore do not make a good control group. 

So the control group then becomes - establishments NOT eligible for PPP, and the not-yet-treated group

Then do a 5% sample
*******************************
'''
#df = df.loc[((df['Dppp_ein'] == 1) | (df['Deligible'] == 0))]
print('***************as of 8/27/22 - 108.59 million, includes government')
print('total 2017 employment')
print(df['aaemp_17'].sum()/1e6)

############# keeping same LDB numbers as used in analysis

samplist = pd.read_csv(dataloc + 'pppfiles/' + filename[:-5] + '_ldbnums.csv')['ldb_num'].tolist()

df = df.loc[((df['ldb_num'].isin(samplist)))].drop_duplicates()



###clean up columns
for c in [i for i in df if i.endswith('_x')]:
    df[c[:-2]] = df[c].fillna(df[c[:-2]+'_y'])
    df = df.drop(columns = [c, c[:-2]+'_y'])

###get months
df['ppp_month'] = pd.to_datetime(df['DateApproved'], errors='coerce').dt.month
df['ppp_day'] = pd.to_datetime(df['DateApproved'], errors='coerce').dt.day
df['ppp_ein_month'] = df['approval_date'].dt.month
df['ppp_ein_day'] = df['approval_date'].dt.day
###### QCEW ref period is 12th of month, so make PPP relative to that
cond = (df['ppp_ein_day'] <= 12)
df['ppp_ein_month'] = np.where(cond, df['ppp_ein_month'] , df['ppp_ein_month'] + 1)
cond = (df['ppp_day'] <= 12)
df['ppp_month'] = np.where(cond, df['ppp_month'] , df['ppp_month'] + 1)

df['ppp_month'] = df['ppp_month'].fillna(0)
df['ppp_ein_month'] = df['ppp_ein_month'].fillna(0)

# editing urban class
cond = (df['urban_classification'] >= 5)
df['urban_classification'] = np.where(cond, 5, df['urban_classification'])
df['urban_classification'] = df['urban_classification'].fillna(7)



#for missings, fill in 0
df['yr_growth'] = df['yr_growth'].fillna(0)


###ein size class
tlabs = ['0','1', '10', '50', '500', '5000']
bins = [-1, 1,10, 50, 500, 5000, 9e9]
df['ein_size_class'] = pd.cut(df['ein_aaemp_17'], bins=bins, labels=tlabs, right=True).astype('float')

### size class
tlabs = ['0', '10', '50', '100',]
bins = [-1, 10, 50, 100, 9e9]
df['size_class'] = pd.cut(df['aaemp_17'], bins=bins, labels=tlabs, right=False).astype('float')

df['size_class_multi'] = ''
for t in tlabs[:]:
    t=float(t)
    cond = ((df['size_class'] == t) & (df['ein_aaemp_17'] > df['aaemp_17']))
    cond1 = ((df['size_class'] == t) & (df['ein_aaemp_17'] <= df['aaemp_17']))
    df['size_class_multi'] = np.where(cond, 'm' + str(int(t)), np.where(cond1, 's' + str(int(t)), df['size_class_multi']) )

####filling in missing HHI
# df['hhi_stnaics2mean'] = df[['hhi','fipsstate', 'naics2']].to_pandas().groupby(['fipsstate', 'naics2'])['hhi'].transform("mean")
# df['hhi'] = df['hhi'].to_pandas().fillna(df['hhi_stnaics2mean'].to_pandas())
# tlabs = ['0', '1000',  '2500',]
# bins = [-1, 1000, 2500, 9e9]
# df['hhi_cut'] = pd.cut(df['hhi'].to_pandas(), bins=bins, labels=tlabs, right=False).astype('float')
#
# tlabs = ['0', '1000',]
# bins = [-1, 1000,  9e9]
# df['hhi_cut2'] = pd.cut(df['hhi'].to_pandas(), bins=bins, labels=tlabs, right=False).astype('float')

###wages
#avg wage cut

bins = [-1, 20e3, 40e3, 60e3, 80e3, 1e999]
wagelist = [i for i in range(1,6)]
df['avg_wages_bin'] = pd.cut(df['wage_per_worker'], bins=bins, labels=wagelist).astype('float')

####age cut
df['age'] = 2020 - df['setup_yr']

bins = [-9e9, 5, 10, 20,9e9]
agelist = ['0', '6', '11', '21']
df['age_bins'] = pd.cut(df['age'], bins=bins, labels=agelist, right=True).astype('float')

####poverty
# dfcest = cudf.read_csv(dataloc + 'pppfiles/countypoverty.csv')
#
#
# mc = ['fipsstate', 'fipscounty']
# df = df.merge(cudf.DataFrame(dfcest), on = mc, how = 'left')
# tlabs = ['0', '10',  '15',]
# bins = [-1, .10, .15,  9e9]
# df['poverty_cut'] = pd.cut(df['pct_poverty'].to_pandas(), bins=bins, labels=tlabs, right=False).astype('float')
#
# df['poverty_cut']= df['poverty_cut'].to_pandas().fillna(999)



####get monthly wage numbers

df['q1_emp'] = df[['emp_m'+str(i) for i in range(1,4)]].sum(axis=1)
df['q2_emp'] = df[['emp_m'+str(i) for i in range(4,7)]].sum(axis=1)
df['q3_emp'] = df[['emp_m'+str(i) for i in range(7,10)]].sum(axis=1)
df['q4_emp'] = df[['emp_m'+str(i) for i in range(10,13)]].sum(axis=1)
df['q5_emp'] = df[['emp_m'+str(i) for i in range(13,16)]].sum(axis=1)
#get proportion of quarter's wage by relative proportion of employment
for tq in range(1,q+1):
    for mon in range((tq-1)*3 + 1, (tq-1)*3 + 1+3):
        df['wage_m' + str(mon)] = df['wages_'+str(tq)] * df['emp_m' + str(mon)] / df['q'+ str(tq) +'_emp']
        cond = ((df['q'+ str(tq) +'_emp']==0) & (df['wage_m' + str(mon)].isnull()) & (df['wages_'+str(tq)] == 0))
        df['wage_m' + str(mon)] = np.where(cond, 0, df['wage_m' + str(mon)])

'''
geography

county info
'''
tcols = ['ldb_num', 'cnty', 'fipsstate']
dfg = pd.concat([pd.read_csv(dataloc + 'ldb/address/ldbaddressesv1_' + str(year) + '.psv', sep='|')[tcols] for year in
                 [2017, 2018]]).drop_duplicates()
dfg['fipscnty'] = dfg['fipsstate'] * 1000 + dfg['cnty']
tdict = dict(zip(dfg['ldb_num'], dfg['fipscnty']))
df['fipscnty'] = df['ldb_num'].map(tdict)
dfg = None
tdict = None


kcols = ['ldb_num', 'fipsstate', 'ein',  'naics',
       'aaemp_17', 'ein_aaemp_17','emp_max_17',
         #'num_estab_ein',
         'setup_yr', 'setup_month',  'size_class', 'size_class_multi', 'naics2', 'LoanAmount',
       'LoanAmount_ein',  'D_eidl_grant', 'D_eidl_loan',
       'Dfranchise', 'cutoff_score', 'yr_growth',
       'fipscnty', 'urban_classification', 'Deligible', 'Dppp','Dppp2021', 'Dppp_ein',
         'amount_final',  'ppp_ein_month',
       'ein_size_class', 'avg_wages_bin', 'age_bins'] + closedmons

df1 = pd.wide_to_long(df[['ldb_num'] + ['emp_m'+str(mon) for mon in range(1,moncount)]].drop_duplicates('ldb_num'), stubnames = 'emp_m', i = ['ldb_num'], j = 'num_month').reset_index()


df1b = pd.wide_to_long(df[['ldb_num'] + ['wage_m'+str(mon) for mon in range(1,moncount)] ].drop_duplicates('ldb_num'), stubnames = 'wage_m', i = ['ldb_num'], j = 'num_month').reset_index()

df = df[kcols]
#keep if all months observed
df1 = df1.loc[df1['emp_m'].notnull()]
df1b = df1b.loc[df1b['wage_m'].notnull()]
# #get rid of respondents with zero employment for all of 2020
# df1['tot_emp'] = df1.groupby('ldb_num')['emp_m'].transform('sum')
# kcond = (df1['tot_emp'] > 0)
# df1 = df1[kcond]

df1['tot_obs'] = df1[['ldb_num', 'emp_m']].groupby('ldb_num')['emp_m'].transform('count')

kcond = (df1['tot_obs'] == 15)
df1 = df1[kcond][['ldb_num', 'emp_m', 'num_month']]

print('are the two sets the same length?')
print(len(df1))
print(len(df1b))
df1 = df1.merge(df1b, on = ['ldb_num', 'num_month'], how = 'left')
df1b = None

#####merge in the prev year info. The idea is to take average of SAME month employment / wages
###use this as baseline
df1['emp_avg'] = np.NaN
df1['wage_avg'] = np.NaN

df1 = df1.merge(pd.read_csv(dataloc + 'ldb/'+'CRldb_ppp_yearempv2' + str(2017) + '.csv'), on = ['ldb_num'], how = 'left', indicator=False)

for j in range(1,moncount):
    cond = (df1['num_month'] == j)
    j1 = ((j-1) % 12) + 1
    df1['emp_avg'] = np.where(cond, df1['emp_m'+str(j1)+'_avg'] , df1['emp_avg'])
    df1['wage_avg'] = np.where(cond, df1['wage_m' + str(j1) + '_avg'], df1['wage_avg'])

df1 = df1[['ldb_num', 'emp_m','wage_m', 'num_month', 'emp_avg', 'wage_avg']]


closedmons = [c for c in df if c.endswith('_closed')]
kcols = ['ldb_num', 'fipsstate',  'ein', 'naics',
       'aaemp_17', 'ein_aaemp_17','emp_max_17',
         #'num_estab_ein',
         'setup_yr', 'setup_month',
        'size_class', 'size_class_multi', 'naics2', 'LoanAmount',
       'LoanAmount_ein',  'D_eidl_grant', 'D_eidl_loan',
       'Dfranchise', 'cutoff_score', 'yr_growth',
       'fipscnty', 'urban_classification', 'Deligible', 'Dppp','Dppp2021', 'Dppp_ein',
         'amount_final',  'ppp_ein_month',
       'ein_size_class', 'avg_wages_bin', 'age_bins'] + closedmons


df = df[kcols].merge(df1, on = ['ldb_num'], how = 'inner')


'''
did close from step above, but need to bring it over to wage info, too
'''
cond = (df['emp_m'] == 0)
df['wage_m'] = np.where(cond, 0, df['wage_m'])

cond = (df['emp_m'] == 0)
conda = (df['emp_m'].isnull())
df['Dclosed'] = np.where(cond, 1, np.where(conda, np.NaN, 0))


df['pct_emp'] = 100 * df['emp_m'] / df['emp_avg']
df['pct_wage'] = 100 * df['wage_m'] / df['wage_avg']
for c in ['emp', 'wage']:
    cond = (df[c+'_avg'] == 0)
    df['pct_'+c] = np.where(cond, 100, df['pct_'+c].replace(np.inf,np.NaN))


###using avg of prev year
df['pct_emp_1']  = (100 * df['emp_m'] / df['aaemp_17']).replace(np.inf,np.NaN)
###using emp of jan 2020
#need to merge in jan emp
mc = ['ldb_num']
dft = df[df['num_month'] == 1][['ldb_num', 'emp_m']].rename(columns = {'emp_m' : 'emp_jan20'}).drop_duplicates(mc)
df = df.merge(dft, on = mc, how='left')
df['pct_emp_2']  = (100 * df['emp_m'] / df['emp_jan20']).replace(np.inf,np.NaN)
dft = None



#dfces.to_pandas().to_csv(dataloc + 'pppfiles/' + filename + '_unbalanced.csv')
kcols = list(set(['naics2', 'fipsstate',  'age_bins', 'avg_wages_bin', 'urban_classification',
                                                  'ein_size_class', 'size_class', 'size_class_multi',  'Dclosed', 'pct_emp','pct_wage',
         'yr_growth', 'ppp_ein_month', 'ldb_num', 'num_month', 'age_bins', 'bank_dist_bins',
         'ein_size_class', 'avg_wages_bin', 'Deligible', 'Dfranchise', 'fipscnty','aaemp_17','cutoff_score','amount_final', 'LoanAmount',
 'LoanAmount_ein','ein','ein_aaemp_17','emp_max_17',
                  #'janmar_emp_ratio', 'aprjun_emp_ratio', 'julsep_emp_ratio', 'octdec_emp_ratio',
                  #'janmar_wages_ratio', 'aprjun_wages_ratio', 'julsep_wages_ratio', 'octdec_wages_ratio',
                  'D_eidl_grant', 'D_eidl_loan', 'emp_m', 'wage_m','Dppp2021',
                  'pct_emp_1', 'pct_emp_2', 'emp_avg','wage_avg',
                  'Dppp_employment_verify','poverty_cut', kwvar, 'hhi_cut','hhi_cut2',
'Dppp_employment_verify_80_99','fipscnty','naics',
                  ]
                 )) + closedmons

kcols = [c for c in kcols if c in df]

### need to fill in 0 for these
for c in [ 'D_eidl_grant', 'D_eidl_loan','Dppp2021']:
    df[c] = df[c].fillna(0)

print("checking to see how many observations are lost by dropping nulls")
print("number of missings for each column")
print(df[kcols].isnull().sum())
print(len(df))
othercols = ['LoanAmount_ein', 'LoanAmount', 'pct_emp_1', 'pct_emp_2', 'emp_avg', 'amount_final', 'fipscnty']
df = df[kcols].dropna(subset = [i for i in kcols if i not in othercols]).drop_duplicates(['ldb_num', 'num_month'])
print(len(df))

#dummy for if state is high/low replacement rate
dfui = pd.read_csv(dataloc + 'rona/ui_replacement_rates_ganongetal.txt',sep='\t').iloc[:-1].rename(columns = {'State' : 'state'})
dfui['state'] = dfui['state'].apply(lambda x: x.strip())
for c in list(dfui.columns)[1:]:
    dfui[c] = pd.to_numeric(dfui[c].apply(lambda x: float(x.replace('%','').replace('(','').replace(')',''))))

dfui['fipsstate'] = dfui['state'].apply(lambda x: stateinfo(x.strip(), 'fips'))
bins = [-1, 141, 154, 1e999]
wagelist = ['<141%', '141-154%', '154%+']
dfui['replacement_bins']  = pd.cut(dfui['replacement rate with FPUC'], bins=bins, labels=wagelist)

tdict = dict(zip(dfui['fipsstate'], dfui['replacement_bins']))
df['replacement_bins'] = df['fipsstate'].map(tdict)

df['num_obs'] = df[['ldb_num', 'fipsstate']].groupby('ldb_num')['fipsstate'].transform('count')

# print("checking to see how many observations are lost with the full 15 month cutoff")
# print(len(df))
totobs = q*3
# kcond = (df['num_obs'] == totobs)
# df = df.loc[kcond]
# print(len(df))

'''
markers and edits for robustness checks
'''
###dummy for if LDB ever has a 0 avg month, to be used for alternative specification
cond = (df['emp_avg'] == 0)
df['D0month'] = np.where(cond, 1,0)
df['D0month'] = df.groupby('ldb_num')['D0month'].transform("sum")
cond = (df['D0month'] == 0)
df['D0month'] = np.where(cond, 1, 0)
###dummy for if pct_emp_1 is valid for all months
cond = (df['pct_emp_1'].notnull())
df['Dpct_emp_1'] = np.where(cond, 1,0)
df['Dpct_emp_1'] = df.groupby('ldb_num')['Dpct_emp_1'].transform("sum")
cond = (df['Dpct_emp_1'] == totobs)
df['Dpct_emp_1'] = np.where(cond, 1, 0)
###dummy for if pct_emp_1 is valid for all months
cond = (df['pct_emp_2'].notnull())
df['Dpct_emp_2'] = np.where(cond, 1,0)
df['Dpct_emp_2'] = df.groupby('ldb_num')['Dpct_emp_2'].transform("sum")
cond = (df['Dpct_emp_2'] == totobs)
df['Dpct_emp_2'] = np.where(cond, 1, 0)

samplecut = .2
###save
###dfces.to_csv(dataloc + 'pppfiles/' + filename + '_cudfversion.csv')
#### get sample cutoffs
ldblist = df['ldb_num'].unique().tolist()
samplist = random.sample(ldblist, int(len(ldblist) * .01 / (samplecut)))
cond = (df['ldb_num'].isin(samplist))
df['D1pctsample'] = np.where(cond, 1, 0)
samplist = random.sample(ldblist, int(len(ldblist) * .05 / (samplecut)))
cond = (df['ldb_num'].isin(samplist))
df['D5pctsample'] = np.where(cond, 1, 0)
samplist = random.sample(ldblist, int(len(ldblist) * .10 / (samplecut)))
cond = (df['ldb_num'].isin(samplist))
df['D10pctsample'] = np.where(cond, 1, 0)

###key dep vars
for c in ['pct_wage','pct_emp', 'Dclosed','Dppp_employment_verify']:
    df['D'+c] = 1

df['Dclosedsum'] = df.groupby('ldb_num')['Dclosed'].transform('sum')
cond = (df['Dclosedsum'] == 0)
df['Dpct_wage2'] = np.where(cond, 1, 0)
df['pct_wage2'] = df['pct_wage'].copy()

'''
****************************************
****************************************
this ties in cnty-naics4 employment and wage
****************************************
****************************************
'''
df['naics4'] = df['naics'].astype('str').str[:4].astype('float')
#df['zip5'] = pd.to_numeric(df['zip5'], errors='coerce')
mc = ['fipscnty', 'naics4']
dft = []
for i in range(1,moncount):
    renamedict = {'emp_m' : 'zn_emp',
                  'emp_19' : 'zn_emp19',}
    dft1 = df[df['num_month'] == i].merge(pd.read_csv(dataloc + 'pppfiles/' + 'CRppp_cntynaicsv1_2019_'+ str(i)+'.csv').drop_duplicates(subset = ['fipscnty', 'naics4']).rename(columns = renamedict), on=mc, how='inner')
    '''
    **************************
    **************************
    **************************
    partly out of laziness, partly out of lack of need
    2019 does not have cnty-naics wage data
    just using 
    **************************
    **************************
    **************************
    '''
    renamedict = {'wage_m': 'zn_wage',
                  'wages_19': 'zn_wage19',}
    dft1 = dft1.merge(pd.read_csv(dataloc + 'pppfiles/' + 'CRppp_cntynaicsv1_wage'+ str(i)+'.csv').drop_duplicates(subset = ['fipscnty', 'naics4']).rename(columns = renamedict), on=mc, how='inner')
    dft1['cntynaics_emp'] = ((dft1['zn_emp'] - dft1['emp_m']) / (dft1['zn_emp19'] - dft1['emp_avg'])).replace([np.inf,-np.inf],1)
    #divide 2019 wage by 3 because it's quarterly
    dft1['cntynaics_wage'] = ((dft1['zn_wage'] - dft1['wage_m']) / ((dft1['zn_wage19']/3) - dft1['wage_avg'])).replace([np.inf, -np.inf],1)
    for c in ['cntynaics_emp', 'cntynaics_wage']:
        cond = (dft1[c] < 0 )
        dft1[c] = np.where(cond, 1, dft1[c])
    dft.append(dft1)
    dft1 = None

###just so these don't get dropped, filling in 1 if zip code is missing
###already dropping missing county above
cond = (df['fipscnty'].isnull())
df['Dmissingcnty'] = np.where(cond, 1, 0)
#add in missing zip codes
dft.append(df[df['Dmissingcnty'] == 1])


dcols = ['zn_emp', 'zn_emp19', 'zn_wage', 'zn_wage19', 'wage_m', 'emp_m']
df = pd.concat(dft).drop(columns = dcols).drop_duplicates(['ldb_num', 'num_month'])
dft = None
###just so these don't get dropped, filling in 1 if zip code is missing
cond = (df['fipscnty'].isnull())
df['Dmissingcnty'] = np.where(cond, 1, 0)
for c in  ['cntynaics_emp', 'cntynaics_wage']:
    df[c] = np.where(cond, 1, df[c])

kcond = ((df['cntynaics_emp'].isnull()) | (df['cntynaics_wage'].isnull()))
df = df[~kcond].rename(columns = {'fipsstate':'fips'})

df.to_csv(dataloc + 'pppfiles/' + filename + '.csv')
print('full data saved')
#ldblist = cudf.read_csv(dataloc + 'pppfiles/' + filename + '_ldbnums.csv')
samplist = random.sample(ldblist, int(3e4))
cond = (df['ldb_num'].isin(samplist))
df['Dsmallsample'] = np.where(cond, 1, 0)

df.loc[(df['Dsmallsample']==1)].to_csv(dataloc + 'pppfiles/' + filename + '_small.csv')


